################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2024)

## Description:

# This file makes the word propensity (q), posterior (rho) and divergence (pi)
# for social background
## # Description:
# - Reads in the dtm and use applied distributed multinomial regression (dmr) to 
# estimate - divergence across white collar/blue collar
# Output is the word propensity (q), posterior (rho) and divergence (pi)

# This code is based on the June 2024 version of "Sample Code" 
# provided on the following webpage:
# https://scholar.harvard.edu/shapiro/publications
# Under: 
# 2019 Gentzkow, Matthew, Jesse M. Shapiro, and Matt Taddy. 
# “Measuring group differences in high-dimensional choices: 
# Method and application to congressional speech.” 
# Econometrica 87, no. 4 (2019): 1307-1340.

################################################################################

# Clear memory and work environment

rm(list = ls())

gc()

# Packages

library(data.table)
library(parallel)
library(distrom)

## The wd is set by master.R

data.dir   <-  "../data/2_processed_data"
output.dir <- "../data/3_model_output"


# Read data----

## Meta
speaker_metadata <- fread(paste(data.dir, "speeches_session_lemma.csv", sep = "/"), 
                          drop = "text",
                          encoding="UTF-8")

speaker_metadata <- speaker_metadata[!is.na(occupation)]

speaker_metadata$occupation <- if_else(speaker_metadata$occupation == "white", 1, 0)


## Document term matrix
dtm.counts <- readRDS(paste(data.dir, "document_term_matrix.rds", sep = "/"))

# Dropping speaches of speakers without information on social background
dtm.counts <- dtm.counts[rownames(dtm.counts) %in% speaker_metadata$pid_session,]


DT <- speaker_metadata

for (j in 1:100) {
  
  speaker_metadata <- DT
  
  print(j)
  
  # Using the equation notation as in the paper.
  ## Our indicator category (A) is white-collar politician, and our reference 
  ## category (B) is blue-collar politicians
  
  
  tot_speachers <- table(speaker_metadata$occupation, speaker_metadata$session)[1,]+table(speaker_metadata$occupation, speaker_metadata$session)[2,]
  frac <- table(speaker_metadata$occupation, speaker_metadata$session)[1,]/tot_speachers
  
  # First create empty vector
  speaker_metadata$random_session <- 0
  
  sessions <- paste0(1981:2020, "-", 1982:2021)
  n_i <- 1:40
  
  for(i in 1:length(n_i)){
    
    speaker_metadata$random_session[speaker_metadata$session==sessions[i]] <-sample(c(1,2), nrow(speaker_metadata[speaker_metadata$session==sessions[i]]), replace=TRUE, prob=c(frac[n_i[i]],(1-frac[n_i[i]])))
    
  }
  
  
  table(speaker_metadata$random_session)
  table(speaker_metadata$random_session, speaker_metadata$session)
  
  
  speaker_metadata$occupation <- speaker_metadata$random_session
  
  #keep only the metadata that can be found in DT
  speaker_metadata[, A := dplyr::if_else(occupation == 1, 1, 0)] 
  
  # keep only the metadata that can be found in dtm 
  speaker_metadata <- speaker_metadata[pid_session %in% rownames(dtm.counts)] 
  
  
  # Construct penalized estimation inputs
  # No intercept model
  
  X           <- sparse.model.matrix(
    
    ~ 0 + session + factor(party) + arb_sos + energi_miljo + fam_kult +	finans +	helse_omsorg +	
      justis +	kom_forvalt +	kontroll_konst +	naering +	trans_kom +	utd_forsk +	
      uten_forsvar +	energi_industri +	familie_kultur_admin +	forbruker_admin +	
      forsvar +	kirke_undervisning +	kirke_utdanning_forskning +	kom_miljo +	
      kommunal +	landbruk +	samferdsel +	sjofart_fisk +	sosial +	
      utenriks_konstitusjon +	utenriks,
    
    data = speaker_metadata
  )
  
  # # Do the QR decomposition
  qx <- qr(as.matrix(X))
  X <- X[, qx$pivot[1:qx$rank]] #Keep independent columns
  
  # # The rows of X are the speaker_id
  rownames(X) <- speaker_metadata$pid_session
  
  # # Keeping only metadata of the speeches in the corpus: 
  X <- X[rownames(dtm.counts), ]
  
  # Create dummies for A interacted with parlamentary period
  A   <- sparse.model.matrix(~ 0 + session:A, data = speaker_metadata)
  
  rownames(A) <- speaker_metadata$pid_session #naming each row with speech name
  
  A <- A[rownames(dtm.counts), ]
  
  mu <- log(rowSums(dtm.counts)) #mu is the shape parameter of the poisson
  
  # Estimate penalized MLE----
  
  # Detect the number of available CPU cores
  n.cores <- detectCores()
  
  # Decide how many cores to leave free (e.g., 1 or 2)
  cores_to_leave_free <- 2
  n.cores <- max(1, n.cores - cores_to_leave_free) # Ensure at least 1 core is used
  
  cl <- makeCluster(n.cores, type = ifelse(.Platform$OS.type == 'unix', 'FORK', 'PSOCK')) 
  
  fit <- dmr( #distributed multinominal regression: https:\\arxiv.org/abs/1311.6139
    cl = cl, 
    covars = cbind(X, A),
    counts = dtm.counts,
    verb = 1, 
    mu = mu,
    free = 1:ncol(X),
    fixedcost = 1e-5,
    lambda.start = Inf,
    lambda.min.ratio = 1e-5,
    nlambda = 100,
    standardize = F
  )
  stopCluster(cl)
  
  
  ############# Coefs 1 (from penalised MLE) #######################
  
  coefs   <- coef(fit, k = log(nrow(X)), corrected = F)
  
  coefs_X <- coefs[colnames(X), ]
  coefs_A <- coefs[colnames(A), ]
  
  # Phrase-time-specific utility loadings for each phrase if spoken by speakers
  # of category A.
  
  I           <- sparse.model.matrix(~ 0 + session, data = speaker_metadata)
  rownames(I) <- speaker_metadata$pid_session
  I           <- I[rownames(dtm.counts), ]
  phi         <- I %*% coefs_A
  
  # Utility
  
  # Compute utility of speech if all speakers have affiliation B
  utility_B        <- cbind(1, X) %*% rbind(coefs['intercept', ], coefs_X)
  # Compute utility of speech if all speakers have affiliation A 
  utility_A        <- utility_B + phi
  
  # Compute utility of speech for observed speakers and 
  # speaker "clones" with the same speech and covariates but the opposite 
  # affiliation. 
  affiliation_matrix       <- replicate(ncol(dtm.counts), rowSums(A))
  affiliation_matrix_clone <- (1 - affiliation_matrix)
  utility_real       <- utility_B + affiliation_matrix * phi
  utility_clone      <- utility_B + affiliation_matrix_clone * phi
  utility            <- rbind(utility_real, utility_clone)
  q                  <- exp(utility) / rowSums(exp(utility))
  
  # Compute expected posterior that speaker and clone is A based on speech.
  # Compute rho through the likelihood ratio, not from the q's.
  party_ratio        <- rowSums(exp(utility_B)) / rowSums(exp(utility_A))
  likelihood_ratio   <- party_ratio * exp(phi)
  rho                <- likelihood_ratio / (1 + likelihood_ratio)
  rho                <- rbind(rho, rho)
  
  expected_posterior <- rowSums(rho * q)
  
  # Indicate A/B affiliation for speakers and clones
  
  affiliation_A <- rowSums(A) > 0
  affiliation_A <- as.matrix(affiliation_A)
  affiliation_A <- rbind(affiliation_A, 1 - affiliation_A)
  affiliation   <- ifelse(affiliation_A, 'A', 'B')
  
  # Indicate session of congress for speakers and clones
  session        <- speaker_metadata$session
  names(session) <- speaker_metadata$pid_session
  session        <- session[rownames(dtm.counts)]
  session        <- as.matrix(session)
  session        <- rbind(session, session)
  
  # Compute divergence
  party_pi <- tapply(expected_posterior, list(affiliation, session), mean) 
  pi       <- .5 * (party_pi['A', ] + (1 - party_pi['B', ]))
  pi       <- data.frame(session = names(pi), pi = pi)
  
  assign(paste("pi1_", j, sep = ""), pi)
  
}

sessions <- c(1982:2021)
totpi <- sessions


for (j in 1:100) {
  totpi <- as.data.frame(cbind(totpi, eval(parse(text=paste("pi1_", j, "[,2]", sep="")))))
  
}

totpi_sort <-  cbind(sessions, t(apply(totpi[-1], 1, 
                                       FUN=function(x) sort(x, decreasing=TRUE))))

#save results
write.csv(totpi_sort, file = paste(output.dir ,'occ_comparty_placebo.csv', sep = "/"), row.names = F, quote = F)








